/*******************************************************************************
File    : Census_2001.do
Project : Bank Expansion and Moneylender Interest Rates - RDD Evidence from India
Purpose : Construct Census 2001 district-level population dataset (PC01_A01)
Author  : Kannan Narayanaswamy
Updated : 09 Feb 2026
Source  : Census table "PC01_A01 – Area, households and population"
          (Downloaded from Census of India tables portal)
*******************************************************************************/

/*--------------------------------------------------------------------------------------------------
   Constructing Census 2001 District–Level Population Dataset (PC01_A01)
   Author      : Kannan Narayanaswamy
   Created     : 03/09/2025
   Last Edited : 03/17/2025

   Source: Census table "PC01_A01 – Area, households and population"
   Download: https://censusindia.gov.in/census.website/data/census-tables
--------------------------------------------------------------------------------------------------*/

cls

*--------------------------------------------------------------------------------------
* 1. Set directory and import raw Excel
*--------------------------------------------------------------------------------------
cd "`c(pwd)'\Data\MOF_Data\Census 2001"

import excel "PC01_A01.xls", sheet("A1-Final") cellrange(A2:M4081) firstrow clear

*--------------------------------------------------------------------------------------
* 2. Basic cleaning: drop extra rows/cols and rename variables
*--------------------------------------------------------------------------------------

* Rename main variables
rename NumberofVillages______ no_villages_inhabited
rename D                      no_villages_uninhabited
rename Population             pop_total
rename H                      pop_male
rename I                      pop_female

* Drop unused columns
drop L M

* Drop obviously extraneous rows (header/footer junk from import)
drop in 2862/2891
drop in 1/153

*--------------------------------------------------------------------------------------
* 3. Classify rows as STATE/UT vs DISTRICT & drop non-data rows
*--------------------------------------------------------------------------------------
gen Level = ""
replace Level = "STATE/UT" if IndiaStateUnionTerritory == upper(IndiaStateUnionTerritory)
replace Level = "DISTRICT" if IndiaStateUnionTerritory != upper(IndiaStateUnionTerritory)

* Drop header and non-data text rows
drop if inlist(IndiaStateUnionTerritory, ///
    "A-1    NUMBER OF VILLAGES,TOWNS, HOUSEHOLDS, POPULATION AND AREA", ///
    "India/ State/  Union Territory/ District", "", "1")

*--------------------------------------------------------------------------------------
* 4. Fix names that spilled across rows (manual corrections)
*--------------------------------------------------------------------------------------
replace IndiaStateUnionTerritory = "JAMMU & KASHMIR"                   in 1
replace IndiaStateUnionTerritory = "HIMACHAL PRADESH"                  in 17
replace IndiaStateUnionTerritory = "Udham Singh Nagar"                 in 63
replace IndiaStateUnionTerritory = "Jyotiba Phule Nagar"               in 135
replace IndiaStateUnionTerritory = "Gautam Buddha Nagar"               in 140
replace IndiaStateUnionTerritory = "Sant Ravidas Nagar (Bhadohi)"      in 199
replace IndiaStateUnionTerritory = "Pashchim Champaran"                in 204
replace IndiaStateUnionTerritory = "North Twenty Four Parganas"        in 339
replace IndiaStateUnionTerritory = "South  Twenty Four Parganas"       in 347
replace IndiaStateUnionTerritory = "Pashchimi Singhbhum"               in 366
replace IndiaStateUnionTerritory = "DADRA & NAGAR HAVELI"              in 492
replace IndiaStateUnionTerritory = "Dadra & Nagar Haveli"              in 494
replace IndiaStateUnionTerritory = "Mumbai (Suburban)"                 in 518
replace IndiaStateUnionTerritory = "ANDAMAN & NICOBAR ISLANDS"         in 641

* Drop rural / blank rows and the Rural/Urban indicator
drop if inlist(TotalRuralUrban, "Rural", "")
drop TotalRuralUrban

* Clean area variable (remove footnote asterisks)
replace Areainsqkm = subinstr(Areainsqkm, "*", "", .)

* Convert numerics
destring no_villages_inhabited no_villages_uninhabited ///
         Numberoftowns Numberofhouseholds ///
         pop_total pop_male pop_female ///
         Areainsqkm Populationpersqkm, replace

*--------------------------------------------------------------------------------------
* 5. Create and clean State_UT variable
*--------------------------------------------------------------------------------------
gen State_UT = ""
replace State_UT = strtrim(proper(strlower(IndiaStateUnionTerritory))) if Level == "STATE/UT"
replace State_UT = State_UT[_n-1]                                     if Level == "DISTRICT"
replace State_UT = subinstr(State_UT, "  ", " ", .)

* Fix special cases
replace State_UT = "Manipur"           if State_UT == "Manipur  +"
replace State_UT = "Arunachal Pradesh" if State_UT == "Arunachal"

*--------------------------------------------------------------------------------------
* 6. Handle Delhi aggregation (RBI 2005 Q1 only has Delhi at state level)
*--------------------------------------------------------------------------------------
drop if Level == "STATE/UT" & State_UT != "N.C.T. Delhi"
drop if Level == "DISTRICT" & State_UT == "N.C.T. Delhi"

replace State_UT                = "Delhi"   if State_UT == "N.C.T. Delhi"
replace IndiaStateUnionTerritory = "New Delhi" if State_UT == "Delhi"

*--------------------------------------------------------------------------------------
* 7. Construct District / Parent_District & handle special cases (Mumbai, Ariyalur)
*--------------------------------------------------------------------------------------
gen District        = strtrim(substr(IndiaStateUnionTerritory, 1, 30))
gen Parent_District = District

* Combine Mumbai city + Mumbai (Suburban) into Greater Mumbai
replace Parent_District = "Greater Mumbai" if District == "Mumbai"
replace Parent_District = "Greater Mumbai" if District == "Mumbai (Suburban)"

* Ariyalur existed briefly, then merged into Perambalur; treat as Perambalur
replace Parent_District = "Perambalur" if District == "Ariyalur"

drop District
rename Parent_District District

*--------------------------------------------------------------------------------------
* 8. Collapse to district level (by State_UT × District)
*--------------------------------------------------------------------------------------
collapse (sum) ///
    no_villages_inhabited no_villages_uninhabited ///
    Numberoftowns Numberofhouseholds ///
    pop_total pop_male pop_female ///
    Areainsqkm Populationpersqkm, ///
    by(State_UT District)

*--------------------------------------------------------------------------------------
* 9. Generate district ID and save
*--------------------------------------------------------------------------------------
gen ID_Census = _n
order ID_Census, before(State_UT)

save "Census_2001.dta", replace
export excel "Census_2001.xlsx", firstrow(variables) replace

cd ..\..\..